Loading one-to-one orthologs between P. barbatus and C. floridanus
path_to_pogodata <- paste0(path_to_repo,"/data/gordon_data/")
pogo.cflo <-
read.csv(paste0(path_to_pogodata, "pogo_cflo_orthologs.csv"),
header = T, stringsAsFactors = F, na.strings = c(""," ",".", "NA")) %>%
as_tibble() %>%
select(cflo_gene,pogo_gene) %>%
distinct()
# make a function that takes pogo names and returns cflo names
pogo_to_cflo <- function(geneset) {
cflo_genes <-
pogo.cflo %>%
filter(pogo_gene %in% geneset) %>%
pull(cflo_gene) %>%
unique() %>%
as.character()
return(cflo_genes)
}
# make a function that takes pogo names and returns cflo names
cflo_to_pogo <- function(geneset) {
pogo_genes <-
pogo.cflo %>%
filter(cflo_gene %in% geneset) %>%
pull(pogo_gene) %>%
unique() %>%
as.character()
return(pogo_genes)
}
Compare the gene co-expression networks of Cflo forager brains to that of Pbar foragers.
Specify the order of the samples
# SAMPLE NAME
## specify sample name
sample.name <- c("pbar_ld","pbar_dd","annot")
Load the expression (FPKM) data
# Specify the path to TC5 database
path_to_tc5_repo = "/Users/biplabendudas/Documents/GitHub/Das_et_al_2021"
# Load the TC5 database
tc5.db <- dbConnect(RSQLite::SQLite(), paste0(path_to_tc5_repo,"/data/TC5_data.db"))
# LOAD Pbar DATABASES (TC9)
## 1. TC9_data.db
db <- dbConnect(RSQLite::SQLite(),
paste0(path_to_repo, "/data/databases/TC9_data.db"))
loading data…
# Pbar - LD
#
# extract the (gene-expr X time-point) data
pbar.dat <-
db %>%
tbl(., paste0(sample.name[1] ,"_fpkm")) %>%
select(gene_name, everything()) %>%
collect()
# count the number of time points that has ≥ 1 FPKM
n.expressed <- apply(pbar.dat[-1], 1, function(x) sum(x >= 1))
# subset the data and only keep the filtered genes
pbar.dat <- pbar.dat[which(n.expressed >=6),] # 9925 genes
# Pbar - DD
#
# extract the (gene-expr X time-point) data
pbar.dat.dd <-
db %>%
tbl(., paste0(sample.name[2] ,"_fpkm")) %>%
select(gene_name, everything()) %>%
collect()
#
# count the number of time points that has ≥ 1 FPKM
n.expressed <- apply(pbar.dat.dd[-1], 1, function(x) sum(x >= 1))
# subset the data and only keep the filtered genes
pbar.dat.dd <- pbar.dat.dd[which(n.expressed >=6),] # 9827 genes
# Cflo - foragers - LD
#
# extract the (gene-expr X time-point) data
cflo.dat <-
tc5.db %>%
tbl(., paste0(sample.name[3] ,"_fpkm")) %>%
select(gene_name, X2F:X24F) %>%
collect()
#
# count the number of time points that has ≥ 1 FPKM
n.expressed <- apply(cflo.dat[-1], 1, function(x) sum(x >= 1))
# subset the data and only keep the filtered genes
cflo.dat <- cflo.dat[which(n.expressed >=6),] # 9239 genes
# CONVERT Cflo gene names to Pbar genes
#
# 8120 of the 9239 has an ortholog; 7982 are unique
#
cflo.dat <-
cflo.dat %>%
left_join(pogo.cflo, by=c("gene_name"="cflo_gene")) %>%
filter(!is.na(pogo_gene)) %>%
select(-gene_name) %>%
select(gene_name = pogo_gene, everything()) %>%
# summarize the expression levels of genes with duplicate expression
pivot_longer(cols = X2F:X24F) %>%
group_by(gene_name, name) %>%
summarize(value=mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider()
# Which genes are commonly expressed?
common.genes <- intersect(intersect(pbar.dat$gene_name,
pbar.dat.dd$gene_name),
cflo.dat$gene_name)
# n = 7675 genes
# Subset the dataframes to keep only these 9405 genes (common.genes)
#
pbar.dat <-
pbar.dat %>%
filter(gene_name %in% common.genes)
#
pbar.dat.dd <-
pbar.dat.dd %>%
filter(gene_name %in% common.genes)
#
cflo.dat <-
cflo.dat %>%
filter(gene_name %in% common.genes)
Note, the program will throw an error if there is a mismatch between the lengths of the two datasets.
# We work with two sets:
nSets = 2;
# For easier labeling of plots, create a vector holding descriptive names of the two sets.
setLabels = c("Pbar-LD", "Cflo-LD")
shortLabels = c("Pbar", "Cflo")
# Form multi-set expression data: columns starting from 9 contain actual expression data.
multiExpr = vector(mode = "list", length = nSets)
multiExpr[[1]] = list(data = as.data.frame(t(log2(pbar.dat[-c(1)]+1))));
names(multiExpr[[1]]$data) = pbar.dat$gene_name;
rownames(multiExpr[[1]]$data) = names(pbar.dat)[-c(1)];
multiExpr[[2]] = list(data = as.data.frame(t(log2(cflo.dat[-c(1)]+1))));
names(multiExpr[[2]]$data) = cflo.dat$gene_name;
rownames(multiExpr[[2]]$data) = names(cflo.dat)[-c(1)];
# Check that the data has the correct format for many functions operating on multiple sets:
exprSize = checkSets(multiExpr)
# Check that all genes and samples have sufficiently low numbers of missing values.
gsg = goodSamplesGenesMS(multiExpr, verbose = 3);
## Flagging genes and samples with too many missing values...
## ..step 1
## ..bad gene count: 0, bad sample counts: 0, 0
writeLines("All samples okay?")
## All samples okay?
gsg$allOK
## [1] TRUE
# Use the following code to check the data
# multiExpr[[1]]$data[,1:3]
# We now cluster the samples on their Euclidean distance, separately in each set.
sampleTrees = list()
for (set in 1:nSets) {
sampleTrees[[set]] = hclust(dist(multiExpr[[set]]$data), method = "average")
}
# png(file = paste0(path_to_repo,"/results/temp_files/Plots/TC6_SampleClustering.png"),
# width = 20, height = 30, units = "cm", res = 300)
par(mfrow=c(1,2))
# par(mar = c(0, 4, 2, 0))
for (set in 1:nSets)
plot(sampleTrees[[set]], main = paste("Sample clustering in", setLabels[set]),
xlab="", sub="", cex = 0.7)
# dev.off()
# Define data set dimensions
nGenes = exprSize$nGenes
nSamples = exprSize$nSamples
# save(multiExpr,
# # Traits,
# nGenes, nSamples, setLabels, shortLabels, exprSize,
# file = "TC6_Consensus-dataInput.RData")
# Expression data
multiExpr_1 = list(control = list(data = multiExpr[[1]]$data),
ophio = list(data = multiExpr[[2]]$data));
# module identity for Cflo-control gene
## specify the location of the csv that has this info.
file_loc <- "/results/wgcna_files/res/pbar_ld_module_identity_new_labels.csv"
## load it into R
mod.identity.all <- read.csv(paste0(path_to_repo,file_loc), stringsAsFactors = F)
writeLines("there are 9925 genes in the original network that were classified into 11 modules")
## there are 9925 genes in the original network that were classified into 11 modules
## filter to keep only the genes that we are working with
which.genes <- common.genes
mod.identity <-
mod.identity.all %>%
filter(gene_name %in% which.genes) %>%
select(gene_name,
module_identity=old_labels) %>%
# !!this step is necessary!! #
arrange(gene_name)
## specify the module identity of the genes
moduleColors <- mod.identity %>% pull(module_identity)
multiColor = list(control = moduleColors);
# # Run module preservation function
# mp = modulePreservation(multiExpr_1, multiColor,
# referenceNetworks = 1,
# nPermutations = 200,
# calculateQvalue = TRUE,
# randomSeed = 1,
# quickCor = 0,
# verbose = 3)
# save(mp, file = paste0(path_to_repo,
# "/results/module_preservation/pbar_cflo/modulePreservation_Pbar_v_Cflo.RData"));
## or load the results for faster access
load(file = paste0(path_to_repo,
"/results/module_preservation/pbar_cflo/modulePreservation_Pbar_v_Cflo.RData"));
ref = 1
test = 2
statsObs = cbind(mp$quality$observed[[ref]][[test]], mp$preservation$observed[[ref]][[test]])
statsZ = cbind(mp$quality$Z[[ref]][[test]][,-1], mp$preservation$Z[[ref]][[test]][,-1]);
# Compare preservation to quality:
z.stats <- cbind(statsObs[, c("moduleSize", "medianRank.pres", "medianRank.qual")],
signif(statsZ[, c("Zsummary.pres", "Zsummary.qual")], 2))
z.stats <-
z.stats %>%
rownames_to_column("old_labels") %>%
left_join(unique(mod.identity.all[-1]), by="old_labels") %>%
select(-1) %>%
select(module_name = module_identity,
module_size = moduleSize,
everything())
mains = c("Preservation Median rank", "Preservation Zsummary")
pd <- position_dodge(0.5)
z.stats %>%
mutate(module_name = factor(module_name, levels = unique(mod.identity.all$module_identity))) %>%
na.omit() %>%
# ## PRESERVATION MEDIAN RANK
# ggplot(aes(x=log10(module_size), y=medianRank.pres)) +
## PRESERVATION Z-SUMMARY
ggplot(aes(x=log10(module_size), y=Zsummary.pres)) +
geom_hline(yintercept = c(2,10), col="darkred", alpha=0.5) +
geom_point(alpha=0.5, size=15, col="lightgrey", position = pd) +
geom_text(aes(label=module_name), check_overlap = T) +
theme_Publication(20) +
scale_colour_Publication() +
scale_x_continuous(limits = c(0,max(log10(1000))+0.5),
breaks = c(0,1,2,3),
labels = c("0","10","100","1000")) +
xlab("module size (genes)") +
ylab(mains[2]) +
ggtitle("")
#
# # Module labels and module sizes are also contained in the results
# modColors = rownames(mp$preservation$observed[[ref]][[test]])
# moduleSizes = mp$preservation$Z[[ref]][[test]][, 1];
#
# # leave grey and gold modules out
# plotMods = !(modColors %in% c("grey","gold"));
#
# # Text labels for points
# text = modColors[plotMods];
#
# # Auxiliary convenience variable
# plotData = cbind(mp$preservation$observed[[ref]][[test]][, 2], mp$preservation$Z[[ref]][[test]][, 2])
#
# # Plot
# mains = c("Preservation Median rank", "Preservation Zsummary");
# # sizeGrWindow(10, 5);
# par(mfrow = c(1,2))
# par(mar = c(4.5,4.5,2.5,1))
# for (p in 1:2)
# {
# min = min(plotData[, p], na.rm = TRUE);
# max = max(plotData[, p], na.rm = TRUE);
# # Adjust ploting ranges appropriately
# if (p==2)
# {
# if (min > -max/10) min = -max/10
# ylim = c(min - 0.1 * (max-min), max + 0.1 * (max-min))
# } else
# ylim = c(max + 0.1 * (max-min), min - 0.1 * (max-min))
# plot(moduleSizes[plotMods], plotData[plotMods, p], col = 1, bg = modColors[plotMods], pch = 21,
# main = mains[p],
# cex = 2.5,
# alpha=0.75,
# ylab = mains[p], xlab = "Module size", log = "x",
# ylim = ylim,
# xlim = c(1, 4000), cex.lab = 1.2, cex.axis = 1.2, cex.main =1.4)
# if(p==1)
# text(moduleSizes[plotMods], plotData[plotMods, p], labels=modColors[plotMods], cex=0.75, font=0.75, pos=1)
# if (p==2) {
# abline(h=0)
# abline(h=c(2,10), col = "darkred", lty = 2)
# # abline(h=10, col = "darkgreen", lty = 2)
# # abline(h=30, col = "darkred", lty=2)
# } }
NOTES:
“The “gold” module consists of 1000 randomly selected genes that represent a sample of the whole network”.
# devtools::install_github("biplabendu/timecourseRnaseq",
# repos = "https://mran.microsoft.com/snapshot/2021-09-01")
writeLines("Module: 2")
## Module: 2
mod.identity.all %>%
filter(module_identity == "C2") %>%
pull(gene_name) %>%
unique() %>%
check_enrichment(org="pbar",
what = "pfams",
verbose = T,
expand = T,
plot = T,
bg=mod.identity.all %>% pull(gene_name)) %>%
pull(gene_name) %>%
stacked.zplot(text_size=35) %>%
multi.plot(rows=2,cols=1)
## [1] "Loading annotation file for Pogonomyrmex barbatus"
## [1] "Done."
## Number of genes in background geneset: 9252
## Number of genes in the test set: 1855
## --------------------------------
## Number of pfams terms in background geneset: 4496
## Number of pfams terms (at least 5genes) in background geneset: 495
## Number of pfams terms (at least 5 genes) in test set: 91
## [1] "Testing for enrichment..."
## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]
writeLines("Module: 1")
## Module: 1
mod.identity.all %>%
filter(module_identity == "C1") %>%
pull(gene_name) %>%
unique() %>%
check_enrichment(org="pbar",
what = "pfams",
verbose = T,
expand = T,
plot = T,
bg=mod.identity.all %>% pull(gene_name)) %>%
pull(gene_name) %>%
stacked.zplot(text_size=35) %>%
multi.plot(rows=2,cols=1)
## [1] "Loading annotation file for Pogonomyrmex barbatus"
## [1] "Done."
## Number of genes in background geneset: 9252
## Number of genes in the test set: 810
## --------------------------------
## Number of pfams terms in background geneset: 4496
## Number of pfams terms (at least 5genes) in background geneset: 495
## Number of pfams terms (at least 5 genes) in test set: 36
## [1] "Testing for enrichment..."
## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]
writeLines("Module: 10")
## Module: 10
mod.identity.all %>%
filter(module_identity == "C10") %>%
pull(gene_name) %>%
unique() %>%
check_enrichment(org="pbar",
what = "pfams",
verbose = T,
expand = T,
plot = T,
bg=mod.identity.all %>% pull(gene_name)) %>%
pull(gene_name) %>%
stacked.zplot(text_size=35) %>%
multi.plot(rows=2,cols=1)
## [1] "Loading annotation file for Pogonomyrmex barbatus"
## [1] "Done."
## Number of genes in background geneset: 9252
## Number of genes in the test set: 2225
## --------------------------------
## Number of pfams terms in background geneset: 4496
## Number of pfams terms (at least 5genes) in background geneset: 495
## Number of pfams terms (at least 5 genes) in test set: 41
## [1] "Testing for enrichment..."
## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]
writeLines("Module: 11")
## Module: 11
mod.identity.all %>%
filter(module_identity == "C11") %>%
pull(gene_name) %>%
unique() %>%
check_enrichment(org="pbar",
what = "pfams",
verbose = T,
expand = T,
plot = T,
bg=mod.identity.all %>% pull(gene_name)) %>%
pull(gene_name) %>%
stacked.zplot(text_size=35) %>%
multi.plot(rows=2,cols=1)
## [1] "Loading annotation file for Pogonomyrmex barbatus"
## [1] "Done."
## Number of genes in background geneset: 9252
## Number of genes in the test set: 2030
## --------------------------------
## Number of pfams terms in background geneset: 4496
## Number of pfams terms (at least 5genes) in background geneset: 495
## Number of pfams terms (at least 5 genes) in test set: 50
## [1] "Testing for enrichment..."
## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]